
# V table A11 : train-test detailed summary stat ----------------------------------------


mean_with_share_miss <- function(x){
  # Receives a column (vector) and returns one formatted string:
  #   "<mean> (<pct non-missing>)"
  # The mean ignores NA values and is rounded to 1 decimal. The value in
  # parentheses is the percentage of entries that are not NA, i.e.
  # (1 - share of NA) rounded to 3 decimals and multiplied by 100.
  # `TRUE` is spelled out because `T` is an ordinary variable that can be
  # reassigned elsewhere in a session.
  paste0(round(mean(x, na.rm = TRUE), 1),
         " (",
         round(1 - sum(is.na(x)) / length(x), 3) * 100,
         ")")
}

mean_with_share_zero <- function(x){
  # Receives a column (vector) and returns one formatted string:
  #   "<mean> (<pct non-zero>)"
  # The mean ignores NA values and is rounded to 1 decimal. The value in
  # parentheses is the percentage of NON-zero entries (1 - share of zeros),
  # rounded to 3 decimals before being multiplied by 100.
  #
  # NA values are excluded from both the zero count and the denominator, so
  # the share stays consistent with the NA-excluded mean instead of turning
  # into "NA" as soon as one value is missing. For input without NA the
  # result is unchanged (the denominator equals length(x)).
  n_observed <- sum(!is.na(x))
  paste0(round(mean(x, na.rm = TRUE), 1),
         " (",
         round(1 - sum(x == 0, na.rm = TRUE) / n_observed, 3) * 100,
         ")")
}

mean_with_share_zero_comma <- function(x){
  # Receives a column (vector) and returns one formatted string:
  #   "<mean> (<pct non-zero>)"
  # The mean ignores NA values, is rounded to a whole number and formatted
  # with scales::comma(). The value in parentheses is the percentage of
  # NON-zero entries (1 - share of zeros), rounded to 3 decimals before being
  # multiplied by 100.
  #
  # NA values are excluded from both the zero count and the denominator, so
  # the share stays consistent with the NA-excluded mean instead of turning
  # into "NA" as soon as one value is missing. For input without NA the
  # result is unchanged (the denominator equals length(x)).
  n_observed <- sum(!is.na(x))
  paste0(scales::comma(round(mean(x, na.rm = TRUE), 0)),
         " (",
         round(1 - sum(x == 0, na.rm = TRUE) / n_observed, 3) * 100,
         ")")
}


desc_stats_func_2 <- function(data, sample_txt, group_txt) {
  # Builds a one-row summary of `data` for the detailed train/test table.
  #
  # Args:
  #   data:       data.table holding the columns referenced below
  #               (DMG_*, COV_*, CHR_*, UTL_*, ACG_RUB, BT_*, obs_uniq_ident).
  #   sample_txt: label stored in the `sample` column.
  #   group_txt:  label stored in the `group` column.
  #
  # Returns (invisibly, as the value of the final `:=` call) a one-row
  # data.table in which
  #   * `*_pct` columns are numeric percentages rounded to 1 decimal,
  #   * every other statistic is a pre-formatted character string.
  stats <- data[, {
    # Row-wise totals that are needed twice (mean and share of non-zero rows).
    # NOTE(review): two of the four summands are `_cost` columns although the
    # result is reported as an ambulatory *count* -- confirm this is intended.
    ambu_total <- UTL_l365d_ambuDiag_proc_count + UTL_l365d_ambuTreat_proc_count +
      UTL_l365d_dayHospSurg_cost + UTL_l365d_dayHospNonSurg_cost
    # NOTE(review): the output column is named `l365d` but the inputs are
    # `UTL_f365d_*` columns -- confirm the intended time window.
    hosp_total <- UTL_f365d_diff_count + UTL_f365d_hospPlanned_count +
      UTL_f365d_hospUnplanned_count
    # Denominator shared by the three ACG RUB shares: rows whose RUB is not
    # the literal "Missing".
    rub_known <- sum(ACG_RUB != "Missing")

    list(
      sample      = sample_txt,
      group       = group_txt,
      obs_num     = scales::comma(length(unique(obs_uniq_ident))),
      # outcome
      died_pct    = mean(DMG_died_within_365d == "1"),
      # Demog
      age_avg      = scales::comma(round(mean(DMG_age), 1)),
      female_pct   = mean(DMG_gender == "F"),
      arabs_pct    = mean(DMG_clinic_ethnicity == "arab"),
      sup_ins_pct  = mean(DMG_supplementary_insurance == "1"),
      confined_pct = mean(COV_sw_confined == "1"),
      # Chronic
      CHR_HYPERLIPIDEMIA_pct  = mean(CHR_HYPERLIPIDEMIA == "1"),
      CHR_Hypertension_pct    = mean(CHR_Hypertension == "1"),
      CHR_ARTHROPATHY_pct     = mean(CHR_ARTHROPATHY == "1"),
      CHR_Diabetes_pct        = mean(CHR_Diabetes == "1"),
      CHR_IHD_pct             = mean(CHR_IHD == "1"),
      CHR_Arrhythmia_pct      = mean(CHR_Arrhythmia == "1"),
      CHR_Neurological_pct    = mean(CHR_Neurological == "1"),
      CHR_Kidney_pct          = mean(CHR_Kidney == "1"),
      CHR_Gastritis_pct       = mean(CHR_Gastritis == "1"),
      CHR_CRF_pct             = mean(CHR_CRF == "1"),
      CHR_OSTEOPOROSIS_pct    = mean(CHR_OSTEOPOROSIS == "1"),
      CHR_CVA_pct             = mean(CHR_CVA == "1"),
      CHR_DEPRESSION_pct      = mean(CHR_DEPRESSION == "1"),
      CHR_ValvularCardiac_pct = mean(CHR_ValvularCardiac == "1"),
      CHR_CHF_pct             = mean(CHR_CHF == "1"),
      CHR_COPD_pct            = mean(CHR_COPD == "1"),
      # prior util: "<mean> (<pct non-zero>)" strings
      UTL_l365d_drugs_count   = mean_with_share_zero(UTL_l365d_drugs_count),
      UTL_l365d_labs_count    = mean_with_share_zero(UTL_l365d_labs_count),
      UTL_l365d_imaging_count = mean_with_share_zero(UTL_l365d_imaging_count),
      UTL_l365d_abm_count     = mean_with_share_zero(ambu_total),
      UTL_l365d_ER_count      = mean_with_share_zero(UTL_l365d_ER_count),
      # Kept inline: unlike the helper, this mean is rounded to 0 decimals.
      UTL_l365d_HOSP_count    = paste0(round(mean(hosp_total, na.rm = TRUE), 0),
                                       " (",
                                       round(1 - sum(hosp_total == 0) / .N, 3) * 100,
                                       ")"),
      # util
      cost_1yrBef = mean_with_share_zero_comma(UTL_l365d_total_cost),
      # ACG score
      ACG_RUB_low_pct      = sum(ACG_RUB %in% c("0", "1", "2")) / rub_known,
      ACG_RUB_moderate_pct = sum(ACG_RUB %in% c("3")) / rub_known,
      ACG_RUB_High_pct     = sum(ACG_RUB %in% c("4", "5")) / rub_known,
      # clinical covariates: "<mean> (<pct non-missing>)" strings
      COV_bmi_value = mean_with_share_miss(COV_bmi_value),
      COV_bp_dias   = mean_with_share_miss(COV_bp_dias_last),
      COV_bp_sys    = mean_with_share_miss(COV_bp_sys_last),
      # Lab measurements
      LAB_HB_last           = mean_with_share_miss(BT_HB_last_val_num),
      LAB_HCT_last          = mean_with_share_miss(BT_HCT_HGB_last_val_num),
      LAB_RBC_last          = mean_with_share_miss(BT_RBC_last_val_num),
      LAB_PLT_last          = mean_with_share_miss(BT_PLT_last_val_num),
      LAB_NEUT_abs_EHR_last = mean_with_share_miss(BT_NEUT_abs_last_val_num),
      LAB_LYMP_abs_EHR_last = mean_with_share_miss(BT_LYMP_abs_last_val_num)
    )
  }]
  # Every column whose name contains "pct" currently holds a fraction;
  # convert those to percentages with 1 decimal. The `:=` call is the last
  # expression, so its value (the updated table) is what the function returns.
  pct_cols <- grep("pct", names(stats), value = TRUE)
  stats[, (pct_cols) := lapply(.SD,
                               function(x) round(x * 100, 1)), .SDcols = pct_cols]
}

do_Select_Predictors <- function(data_cnr = dt_cnr) {
  # Builds the detailed train/test descriptive-statistics table.
  #
  # Steps:
  #   1. Derive a numeric bucket from each row's `id_var` (md5 hash) and split
  #      the rows into a "Train" and a "Test" group.
  #   2. Summarise both groups with desc_stats_func_2() and transpose so that
  #      each statistic is a row and each group a column.
  #   3. Write the table to "desc_stats_train_test.tex" via Hmisc::latex().
  #
  # Args:
  #   data_cnr: data.table with an `id_var` column plus every column that
  #             desc_stats_func_2() reads. Defaults to the global `dt_cnr`.
  #
  # Side effects:
  #   * `:=` adds the column `hashed_id_num` to `data_cnr` by reference, so
  #     the caller's table is modified as well.
  #   * Writes "desc_stats_train_test.tex" relative to the working directory.
  #
  # Returns the transposed summary (including the `sample` / `group` header
  # rows and the row-name column).
  test_set_pct <- (1/2)   # of the whole sample
  modulo_type <- as.integer(1 / test_set_pct)

  # Keep only the digits of the md5 hex string and read its first four
  # characters as a number.
  data_cnr[, hashed_id_num := as.numeric(substr(
    gsub("[^0-9]", "", openssl::md5(as.character(id_var))),
    1,
    4))]

  # NOTE(review): the `== 0` bucket is labelled "Train". With modulo 2 both
  # buckets are the same size; for any other fraction the `== 0` bucket would
  # be the smaller one -- revisit the labels before changing test_set_pct.
  unif_desc_stats_2 <- rbind(
    desc_stats_func_2(data_cnr[hashed_id_num %% modulo_type == 0],
                      "Cancer", "Train"),
    desc_stats_func_2(data_cnr[hashed_id_num %% modulo_type != 0],
                      "Cancer", "Test")
  )

  # One row per statistic; `rn` keeps the original column names.
  unif_desc_stats_2_t <-
    as.data.table(t(unif_desc_stats_2), keep.rownames = TRUE)

  # Column names become "<row 1>_<row 2>", i.e. built from the `sample` and
  # `group` rows of the transposed table.
  names(unif_desc_stats_2_t) <- paste0(unif_desc_stats_2_t[1, ],
                                       "_",
                                       unif_desc_stats_2_t[2, ])

  # Display labels, one per statistic row, in the column order produced by
  # desc_stats_func_2().
  names_table_1detailed <-
    c("Number of Beneficiaries", "1-Year All-Cause Mortality (\\%)",
      "Age (mean) (minimum = 25) (y)", "Sex (\\% Female)", "Ethnicity (\\% Arabs)",
      "Supplementary Insurance (\\%)", "Disability (\\%)",
      "Hyperlipidemia", "Hypertension", "Arthropathy", "Diabetes",
      "IHD", "Arrhythmia", "Neurological", "Kidney", "Gastritis", "CRF",
      "Osteoporosis", "CVA", "Depression", "Valvular Cardiac", "CHF", "COPD",
      "Prescription Drugs", "Laboratory Tests", "Imaging Events",
      "Ambulatory Encounters", "Emergency Room Visits", "Hospital Visits",
      "Total Spending (NIS)",
      "Healthy or Low", "Moderate", "High or Very High",
      "BMI", "Diastolic Blood Pressure (mm Hg)", "Systolic Blood Pressure (mm Hg)",
      "Hemoglobin (g/dL)", "Hematocrit, (\\%)", "Red Blood Cells",
      "Platelets  (1000/uL)", "Neutrophiles", "Lymphocytes")

  # Drop uninformative cells: the `sample` / `group` header rows and the
  # row-name column are excluded from the printed table. n.rgroup must sum to
  # the number of remaining rows (42).
  invisible(Hmisc::latex(
    unif_desc_stats_2_t[-c(1, 2), -1],
    file = "desc_stats_train_test.tex",
    center = 'centering',
    colheads = c("Train Set", "Test Set"),
    n.rgroup = c(1, 1, 5, 16, 6, 1, 3, 9),
    rgroup = c("Sample Size", "Outcomes", "Demographics",
               "Chronic Conditions,$\\dagger$ \\%",
               "Prior Utilization, mean 1yr count (\\% non zero)",
               "Prior Utilization, mean 1yr cost (\\% non zero)",
               "ACG Score,*",
               "Clinical Measurements$\\dagger$, last measurement, mean (\\% non missing)"),
    rowname = names_table_1detailed,
    rowlabel = "",
    col.just = c("l", rep.int("r", 2)),
    extracolheads = c("(1)", "(2)"),
    na.blank = TRUE,
    extracolsize = "normalsize"
  ))
  return(unif_desc_stats_2_t)
}

